{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_data(path):\n",
    "    user = pd.read_csv(path + 'user.csv',header=None)\n",
    "    item = pd.read_csv(path + 'item.csv',header=None)\n",
    "    data = pd.read_csv(path + 'user_behavior.csv',header=None)\n",
    "\n",
    "    data.columns = ['userID','itemID','behavior','timestamp']\n",
    "    data['day'] = data['timestamp'] // 86400\n",
    "    data['hour'] = data['timestamp'] // 3600 % 24\n",
    "\n",
    "    ## 生成behavior的加权\n",
    "    data['day_hour'] = data['day'] + data['hour'] / float(24)\n",
    "    data.loc[data['behavior']=='pv','behavior'] = 1\n",
    "    data.loc[data['behavior']=='fav','behavior'] = 2\n",
    "    data.loc[data['behavior']=='cart','behavior'] = 3\n",
    "    data.loc[data['behavior']=='buy','behavior'] = 1\n",
    "    max_day = max(data['day'])\n",
    "    min_day = min(data['day'])\n",
    "    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] \n",
    "\n",
    "    item.columns = ['itemID','category','shop','brand']\n",
    "    user.columns = ['userID','sex','age','ability']\n",
    "\n",
    "    data = pd.merge(left=data, right=item, on='itemID',how='left')\n",
    "    data = pd.merge(left=data, right=user, on='userID',how='left')\n",
    "\n",
    "    return user, item, data\n",
    "    \n",
    "def get_unique_inorder(x, k=50):\n",
    "    result = []\n",
    "    flag = set()\n",
    "    for i in x:\n",
    "        if i[0] not in flag:\n",
    "            result.append(i)\n",
    "            flag.add(i[0])\n",
    "        if len(flag) > k:\n",
    "            break\n",
    "    return result\n",
    "\n",
    "def get_recall_list(train, targetDay, k=300):\n",
    "    train_logs = dict()\n",
    "    \n",
    "    if targetDay > max(train['day']):\n",
    "        for row in train[['userID','itemID','behavior']].values:\n",
    "            train_logs.setdefault(row[0], dict())\n",
    "            if row[1] in upward_map:\n",
    "                train_logs[row[0]].setdefault(upward_map[row[1]],0)\n",
    "                train_logs[row[0]][upward_map[row[1]]] = max(train_logs[row[0]][upward_map[row[1]]],row[2])\n",
    "    else:\n",
    "        user_List_test = set(train.loc[train['day']==targetDay,'userID'])\n",
    "        train = train[train['day'] < targetDay]\n",
    "        \n",
    "        for row in train[['userID','itemID','behavior']].values:\n",
    "            if row[0] in user_List_test:\n",
    "                train_logs.setdefault(row[0], dict())\n",
    "                if row[1] in upward_map:\n",
    "                    train_logs[row[0]].setdefault(upward_map[row[1]],0)\n",
    "                    train_logs[row[0]][upward_map[row[1]]] = max(train_logs[row[0]][upward_map[row[1]]],row[2])\n",
    "\n",
    "    for each_user in train_logs:\n",
    "        sum_value = sum(train_logs[each_user].values())\n",
    "        if sum_value > 0:\n",
    "            for each_item in train_logs[each_user]:\n",
    "                train_logs[each_user][each_item] /= sum_value            \n",
    "\n",
    "    result_logs = dict()    \n",
    "    for u in train_logs:\n",
    "        result_logs.setdefault(u, list())\n",
    "        for i in set(train_logs[u].keys()):\n",
    "            if i in item_dict:\n",
    "                tmp_list = [ (x[0], train_logs[u][i]*x[1]) for x in item_dict[i]]\n",
    "                result_logs[u] += tmp_list\n",
    "            \n",
    "    for u in result_logs:\n",
    "        result_logs[u] = get_unique_inorder([(downward_map[x[0]], x[1]) for x in sorted(result_logs[u], key=lambda x:x[1], reverse=True)\n",
    "                          if x[0] not in train_logs[u]], k=300)  \n",
    "    \n",
    "    return result_logs\n",
    "\n",
    "\n",
    "def generate_pairs(recall):\n",
    "    result = []\n",
    "    for u in recall:\n",
    "        for i in recall[u]:\n",
    "            result.append([u,i[0],i[1]])\n",
    "    return result\n",
    "\n",
    "def reshape_recall_to_dataframe(recall):\n",
    "    result = generate_pairs(recall)\n",
    "    result = pd.DataFrame(result)\n",
    "    result.columns = ['userID','itemID','apriori']\n",
    "    return result\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#path = './'\n",
    "path = '../ECommAI_EUIR_round2_train_20190816/'\n",
    "\n",
    "## The target date(16 means online, 15 means underline test, 14 means underline train)\n",
    "targetday = 15\n",
    "\n",
    "## The lenth of recall list, the default is 300\n",
    "lenth = 300\n",
    "\n",
    "## The name of generated recall file\n",
    "\n",
    "name = 'recall_list_round2_%dday_%dlenth.csv'%(targetday, lenth)\n",
    "\n",
    "\n",
    "user, item, data = load_data(path = path)   \n",
    "\n",
    "#tempory_path = './tempory_file/'\n",
    "tempory_path = './'\n",
    "f = open(tempory_path + 'upward_map.txt','r')\n",
    "\n",
    "upward_map = f.read()\n",
    "upward_map = eval(upward_map)\n",
    "f.close()\n",
    "    \n",
    "f = open(tempory_path + 'downward_map.txt','r')\n",
    "downward_map = f.read()\n",
    "downward_map = eval(downward_map)\n",
    "f.close()\n",
    "\n",
    "f = open(tempory_path + 'item_Apriori.txt','r')\n",
    "tmp = f.read()\n",
    "item_dict = eval(tmp)\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "recall_logs = get_recall_list(data, targetDay=targetday, k=lenth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "recall_df = reshape_recall_to_dataframe(recall_logs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = pd.merge(left=recall_df, right=data[data['day'] == targetday][['userID','itemID','behavior']], \n",
    "         on=['userID','itemID'], how='left').rename(columns={'behavior':'label'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(set(recall_df['userID']) & set(data[data['day'] == targetday]['userID']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(set(recall_df['userID']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "recall_df.to_csv(name, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
